/* OPTIONS OBS=500 NOREPLACE  ; */  

%include "ASMimplibs.sas";

/** For 2002 and 2007, create two samples of tabulated, non-AR ready-mix concrete plants:

   (1) All observed and Census Bureau edited/imputed cases, with any missing/faulty values replaced 
       with imputations by the Census Bureau.
   (2) Data that we deem to be "good" data, i.e., as reported on
       the survey form and kept by the edit/impute process or 
       data for which we think the edited value is better than the 
       captured value (e.g., "analyst corrected").

    At the end, produce descriptive statistics for key variables in the Bureau-completed and
    "good" (complete cases) datasets.

	input files:
		concrete.CEA_cou_area_emp02
		concrete.CEA_cou_area_emp07
		concrete.concrete07
                concrete.concrete_cmf_captured02;

	output files:
		concrete.concrete_0207_gooddata
		concrete.concrete_0207_w_CB_imputes
**/


/* Merge the 2002 CEA-area-construction employment data with the 2002 concrete plant data. */

data CEA_cou_area_emp02;
 set concrete.CEA_cou_area_emp02;
run;
proc sort data = CEA_cou_area_emp02;
 by FIPSST COU;
RUN;
data concrete02;
 set concrete.concrete_cmf_captured02;
 year=2002;
run;
proc sort data=concrete02;
 by FIPSST COU;
run;


data concrete02_w_DD rej_cmf02 rej_CEA02;
 merge concrete02 (in=incmf) CEA_cou_area_emp02 (in=inCEA);
by FIPSST COU;
if incmf and inCEA then output concrete02_w_DD;
else if incmf then output rej_cmf02;
else output rej_CEA02;
run;


/* Merge the CEA-area-construction employment data with the 2007 concrete plant data. */

data concrete07;
 set concrete.concrete07;
 year=2007;
run;

proc sort data=concrete07;
 by FIPSST COU;
run;

data CEA_cou_area_emp07;
 set concrete.CEA_cou_area_emp07;
run;

proc sort data=CEA_cou_area_emp07;
 by FIPSST COU;
run;

data concrete07_w_DD rej_cmf rej_CEA;
 merge concrete07 (in=incmf) CEA_cou_area_emp07 (in=inCEA);
by FIPSST COU;
if incmf and inCEA then output concrete07_w_DD;
else if incmf then output rej_cmf;
else output rej_CEA;
run;



/* Stack the years in one dataset 2002-2007. */
data concrete0207;
 set concrete02_w_DD
     concrete07_w_DD;
run;


/*** Create impute flags and save the dataset. ****/

data concrete_0207 (keep =survu_id firmid                        
  tvs cm cp cf ee pe GE /* tab tae */ tce sw te ph ph_nl ph_diff ph_captrd_diff ph_nl_captrd_diff /* 
*/ ww nonpw pw empl_a wt exp  NAICS_NEW_6 industry /*
*/
  tvs_imp cm_imp cp_imp cf_imp ee_imp pe_imp /** tab_imp tae_imp */ tce_imp te_imp ph_imp ww_imp pw_imp /*
*/  tvs_f cm_f cp_f ee_f tab_f tae_f tce_f te_f te_nl_f sw_f ww_f ww_nl_f pw_f pw_nl_f ph_f cf_f ph_nl_f CEA_ddensity /*
*/ BEA_CEA_Code year elec_price FIPSST phpwratio HOURS_TOT_M mr mr_f br br_f eetvsrat eecmrat eeswrat eecprat cftvsrat cfcmrat cfswrat cfcprat);
set concrete0207;

 if TVS_F in (' B',' H',' L',' M',' J',' S',' V','RB','RE','RH','RL','RM','RJ','RS','RV') 
 then tvs_imp = 1;
 else tvs_imp = 0;

 if cm_F in (' B',' H',' L',' M',' S',' V','RB','RE','RH','RL','RM','RS')
               /* and cm_less_cost_mat_tot ne 0 */
 then cm_imp = 1;
 else cm_imp = 0;

 if cp_F in (' D',' H',' J',' L',' T',' V', 'RH','RJ','RL','RK','RT','RV')
 then cp_imp = 1;
 else cp_imp = 0;

 if ee_f in (' H',' V','RH','RV') 
 then ee_imp = 1;
 else ee_imp = 0;

 if ee_f in (' H',' V','RH','RV') 
 and pe_f in (' B','RB')
 then pe_imp = 1;
 else pe_imp = 0;

 if ee_f in (' V', 'RV') and ee ne 0
 then do;
   eetvsrat=ee/tvs;
   eecmrat=ee/cm;
   eeswrat=ee/sw;
   eecprat=ee/cp;
 end;

 if pe>0 then elec_price = ee/pe;
 else elec_price=.;

 if cf_f in (' H',' V','RH','RV') 
 then cf_imp = 1;
 else cf_imp = 0;

 if cf_f in (' V', 'RV') and cf ne 0
 then do;
   cftvsrat=cf/tvs;
   cfcmrat=cf/cm;
   cfswrat=cf/sw;
   cfcprat=cf/cp;
 end;


 if year=2002
 then do;
   if PH_F in (' B',' H',' M') and PH_NL_F in (' B',' H',' M','RB')
   THEN ph_imp = 1;
   else ph_imp = 0;

/* Make PH missing when TVS>0 and (PW>0 or WW>0 or TE>0) and PH=0.
   In 2002 the PH_F flag is missing in many cases.  These PH=0 cases are probably
   either imputed or misreported, since it's impossible to produce concrete 
   without labor. */

   if TAB_F = ' K' and TAE_F=' K' and TCE_F=' K' and TRT_F= ' K' 
   then tce_imp = 1;
   else tce_imp = 0;

   if te_nl_f in (' B',' H',' M',' V','RB','RH') 
   then te_nl_imp = 1;
   else te_nl_imp = 0; 


 end;
 else do;
   if PH_F in (' B',' H',' M','RB','RH','RL') 
   THEN ph_imp = 1;
   else ph_imp = 0;


   if TCE_F in (' B',' L','RL')
   then tce_imp = 1;
   else tce_imp = 0;
 end;

 if tvs>0 and ph=0 and (pw>0 or ww>0 or TE>0) then ph= . ;

 if PW_F IN (' B',' H',' J',' L',' M',' V','RB','RE','RH','RJ','RL','RM','RV') 
 THEN pw_imp = 1;
 else pw_imp = 0;

 if ww_f in (' B',' H',' M') 
 then ww_imp = 1;
 else ww_imp = 0;

 if te_f in (' B',' H','RB') 
 then te_imp = 1;
 else te_imp = 0; 


 nonpw = te - pw;

 ph_diff = ph - ph_nl;
 label ph_diff = "PH - PH_NL";

 ph_captrd_diff = ph - HOURS_TOT_M;
 label ph_captrd_diff = "PH - HOURS_TOT_M";

 ph_nl_captrd_diff = ph_nl - HOURS_TOT_M;
 label ph_nl_captrd_diff = "PH_NL - HOURS_TOT_M";


 NAICS_NEW_6 = substr(NAICS_NEW,1,6);
 industry='concrete';
 if pw>0 then phpwratio = ph/pw;  * ratio of production worker hours to production workers;
run;

proc sort data=concrete_0207; by year; run;

proc univariate data=concrete_0207;
 by year;
 var eetvsrat eecmrat eeswrat eecprat cftvsrat cfcmrat cfswrat cfcprat;
title "Ratios for ratio-imputed energy variables, ready-mix concrete";
run;


/***  
      MERGE THE 2002 AND 2007 TAB and TAE VARIABLES WITH THE PANEL DATASET
      SO THAT EACH PLANT-YEAR OBSERVATION HAS THE 2002 AND 2007 TAB AND TAE
      VARIABLES (WHICH WILL BE MISSING IF THE PLANT WAS NOT IN THE CMF
      IN 2002 OR 2007).  THIS WAY WE DON'T HAVE TO IMPUTE FOR THE TAB AND TAE
      VARIABLES IN NON-CENSUS YEARS.
***/


data concrete02_assets (keep = survu_id tab02 tae02 tab02_imp tae02_imp);
 set concrete02_w_DD;
   if TAB_F = ' K' and TAE_F=' K' and TCE_F=' K' and TRT_F= ' K' 
   then do;
     tab02_imp = 1;
     tae02_imp = 1;
   end;
   else do;
     tab02_imp = 0;
     tae02_imp = 0;
   end;
   tab02 = tab;
   tae02 = tae;
run;

data concrete07_assets (keep = survu_id tab07 tae07 tab07_imp tae07_imp);
 set concrete07_w_DD;
    if TAB_F in ( ' B', ' D',' J',' V')  
    then tab07_imp = 1;
    else tab07_imp = 0;
    if TAE_F in ( ' B', ' D',' J',' V')  
    then tae07_imp = 1;
    else tae07_imp = 0;
   tab07 = tab;
   tae07 = tae;
run;

proc sort data=concrete02_assets;
 by survu_id;
run;
proc sort data=concrete07_assets;
 by survu_id;
run;

proc sort data=concrete_0207;
 by survu_id year;
run;

data concrete_0207_w_CB_imputes;
 merge concrete_0207 (in=in0207) concrete02_assets (in=in02);
by survu_id;
if in0207 then output concrete_0207_w_CB_imputes;
run;

data concrete_0207_w_CB_imputes;
 merge concrete_0207_w_CB_imputes (in=in0207) concrete07_assets (in=in07);
by survu_id;
if in0207 then output concrete_0207_w_CB_imputes;
run;

/* Calculate median electricity price by CEA and year and use it to create
   cost of electricity for plants that generate all their own electricity. */

/**
proc sort data=concrete_0207_w_CB_imputes;
 by year BEA_CEA_Code;
run;

proc means data=concrete_0207_w_CB_imputes median;
 var elec_price;
by year BEA_CEA_Code;
output out=
*/

proc sort data=concrete_0207_w_CB_imputes;
 by year;
run;

proc univariate data=concrete_0207_w_CB_imputes;
 var ge;
by year;
run;

proc freq data= concrete_0207_w_CB_imputes;
 tables tvs_f cm_f cp_f ee_f cf_f tab_f tae_f br_f mr_f tce_f te_f te_nl_F sw_f ww_f ww_nl_f pw_f pw_nl_f ph_f PH_NL_F;
by year;
 title1 "Distribution of edit/impute flags, concrete";
 title2 "Tabulated non-AR plants, by year";
run;


proc means data=concrete_0207_w_CB_imputes N mean min p10 q1 median QRANGE stddev NOLABELS;
  var tvs sw te ph ww pw cm cp cf ee pe GE tab02 tae02 br mr tce;
where year=2002;
title1 "distribution of tabulated, non-AR concrete plants";
title2 "2002 Census, including Census Bureau imputes/edits";
run;

proc sort data=concrete_0207_w_CB_imputes; by ph_imp; run;

proc means data=concrete_0207_w_CB_imputes N mean min p10 q1 median QRANGE stddev NOLABELS;
  var tvs sw te ph pw pw_imp phpwratio ww ww_imp cm cf ee pe GE tab02 tae02 tce;
where year=2002;
by ph_imp;
title1 "distribution of tabulated, non-AR concrete plants, by ph_imp";
title2 "2002 Census, including Census Bureau imputes/edits";
run;

proc means data=concrete_0207_w_CB_imputes N mean min p10 q1 median QRANGE stddev NOLABELS;
  var tvs sw te ph ph_nl ph_diff ph_captrd_diff ph_nl_captrd_diff pw pw_imp phpwratio ww ww_imp cm cf ee pe GE tab02 tae02 tce;
where year=2002 and ph_f = "RS";
title1 "distribution of tabulated, non-AR concrete plants, with ph_f = RS ";
title2 "2002 Census, including Census Bureau imputes/edits";
run;


proc means data=concrete_0207_w_CB_imputes N mean min p10 q1 median QRANGE stddev NOLABELS;
  var tvs tvs_imp sw te te_imp ph ph_imp pw pw_imp phpwratio ww ww_imp cm cf ee pe GE tab02 tae02 tce;
where year=2002 and PH=0;
title1 "distribution of tabulated, non-AR concrete plants with PH=0";
title2 "2002 Census, including Census Bureau imputes/edits";
run;


data concrete.concrete_0207_w_CB_imputes;
 set concrete_0207_w_CB_imputes;
run;

data concrete.concrete_0207_gooddata (keep =survu_id firmid                        
     tvs cm cp cf ee energy energy_cost pe GE tab02 tae02 tab07 tae07 tce sw te ph ww pw nonpw empl_a wt  NAICS_NEW_6 industry CEA_ddensity BEA_CEA_Code year elec_price FIPSST mr br
     logtvs logcp loge logtae02 logtae07 logsw logph logpw logww logte);
 format ee 16.1 cf 16.1 PH 16.1;
set concrete_0207_w_CB_imputes;
 if tvs_imp = 1 then tvs = .;
 if cm_imp = 1 then cm = .;
 if cp_imp = 1 then cp = .;
 if (TVS>0 or TVS=.) and CP = 0 then cp = .;
 if cf_imp = 1 then cf = .;
 if ee_imp = 1 then do; ee = .; elec_price = .; end;
 /** We have many observations with EE=0 and CF=0 (and GE=0) even though TVS>0 and CM>0.
    It's impossible to produce concrete without energy, so presumably this means
    EE and/or CF are so small that they get rounded down to zero (units are thousands of dollars).
    In this case take a random draw from the UNIFORM(0,1) distribution.
 **/
 if TVS > 0 and EE = 0 and GE=0 then ee=uniform(-1); 
 if TVS > 0 and CF = 0 then CF=uniform(-1); 
 if pe_imp = 1 then pe = .;
 if tab02_imp = 1 then tab02 = .; 
 if tae02_imp = 1 then tae02 = .; 
 if tab07_imp = 1 then tab07 = .; 
 if tae07_imp = 1 then tae07 = .;
 if tab02=0 and tae02=0 and tvs>0 then do; tab02=.; tae02=.; end; 
 if tab07=0 and tae07=0 and tvs>0 then do; tab07=.; tae07=.; end; 
 if tce_imp = 1 then tce = .;
 if te_imp = 1 then te = .;
 if ph_imp = 1 then ph = .;
 if tvs>0 and ph=0 and (pw>0 or ww>0 or TE>0) then ph= . ;
 if ww_imp = 1 then ww = .;
 if pw_imp = 1 then pw = .;
 nonpw = te - pw;
 if ge=0 then energy = sum(cf,ee);
 else energy = sum(cf,ee,ge);

 energy_cost = cf + ee + ge;
 
 if tvs>0 then logtvs = log(tvs);
 else logtvs = .;
 if cp>0 then logcp = log(cp);
 else logcp = .;
 if energy>0 then loge = log(energy);
 else loge=.; 
 if tae02>0 then logtae02 = log(tae02);
 else logtae02 = .;
 if tae07>0 then logtae07 = log(tae07);
 else logtae07 = .;
 if sw>0 then logsw = log(sw);
 else logsw = .;
 if ph>0 then logph = log(ph); 
 else logph = .;
 if te>0 then logte = log(te);
 else logte = .;
 if pw>0 then logpw = log(pw);
 else logpw = .;
 if ww>0 then logww = log(ww);
 else logww = .;
run;


